Risk Deep Dive¶
Purpose: Detailed risk analysis for data science / actuarial review
Run via: make run-notebooks MODE=nyc or make all-both
In [1]:
# ============================================================
# SETUP
# ============================================================
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
# Configuration from Makefile
RUN_DIR = os.environ.get("CITIBIKE_RUN_DIR")
mode = os.environ.get("CITIBIKE_MODE", "unknown")
if not RUN_DIR:
raise ValueError("CITIBIKE_RUN_DIR not set.\nRun via Makefile: make run-notebooks MODE=nyc")
RUN_DIR = Path(RUN_DIR)
run_label = RUN_DIR.name
print(f"Mode: {mode}")
print(f"Run directory: {RUN_DIR}")
print(f"Run label: {run_label}")
# Load CSVs
df_year = pd.read_csv(RUN_DIR / "citibike_trips_by_year.csv")
df_month = pd.read_csv(RUN_DIR / "citibike_trips_by_month.csv")
# Scorecard (try 500m first)
scorecard_path = RUN_DIR / "axa_partner_scorecard_500m.csv"
if not scorecard_path.exists():
for p in RUN_DIR.glob("axa_partner_scorecard_*m.csv"):
scorecard_path = p
break
df_score = pd.read_csv(scorecard_path) if scorecard_path.exists() else None
print(f"\nLoaded:")
print(f" df_year: {len(df_year)} rows")
print(f" df_month: {len(df_month)} rows")
print(f" df_score: {len(df_score) if df_score is not None else 'None'} rows")
if df_score is not None:
print(f"\nScorecard columns: {list(df_score.columns)}")
if "year" in df_score.columns:
print(f"Years: {sorted(df_score['year'].dropna().unique())}")
if "month" in df_score.columns:
print(f"Months: {sorted(df_score['month'].dropna().unique())}")
# Figure output
FIG_DIR = RUN_DIR.parent.parent / "reports" / run_label / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)
def savefig(name):
path = FIG_DIR / name
plt.savefig(path, dpi=150, bbox_inches="tight")
print(f"Saved: {path}")
Mode: nyc Run directory: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc Run label: y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc
Loaded: df_year: 4 rows df_month: 48 rows df_score: 72466 rows Scorecard columns: ['mode', 'year', 'month', 'start_station_id', 'start_station_name', 'station_lat', 'station_lng', 'exposure_trips', 'crash_count', 'risk_rate_per_100k_trips', 'risk_rate_ci_low', 'risk_rate_ci_high', 'risk_proxy_available', 'credibility_flag', 'exposure_pct', 'risk_pct', 'axa_priority_score', 'prevention_hotspot', 'product_hotspot', 'acquisition_hotspot', 'exposure_index_pct', 'eb_risk_rate_per_100k_trips', 'risk_index_pct', 'expected_incidents_proxy', 'scoring_strategy', 'eb_m_prior_used'] Years: [np.int64(2019), np.int64(2020), np.int64(2023), np.int64(2025)] Months: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)]
In [2]:
# --- HIGH-RISK STATIONS ---
MIN_TRIPS = 5000
TOP_N = 10
HIGH_RISK_PCT = 90
if df_score is None or len(df_score) == 0:
print("No scoring data found (df_score is missing/empty).")
else:
print("\n" + "=" * 70)
print("HIGH-RISK STATIONS ANALYSIS")
print("=" * 70)
s = df_score.copy()
# Normalize column names
s = s.rename(
columns={
"start_station_id": "station_id",
"start_station_name": "station_name",
"exposure_trips": "trips",
},
errors="ignore",
)
# Identify columns
id_col = "station_id" if "station_id" in s.columns else None
name_col = "station_name" if "station_name" in s.columns else None
rate_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in s.columns
else "risk_rate_per_100k_trips"
)
has_year = "year" in s.columns
has_month = "month" in s.columns
# Clean numeric columns
s["trips"] = pd.to_numeric(s.get("trips", 0), errors="coerce")
s[rate_col] = pd.to_numeric(s[rate_col], errors="coerce")
if "crash_count" in s.columns:
s["crash_count"] = pd.to_numeric(s["crash_count"], errors="coerce")
if has_year:
s["year"] = pd.to_numeric(s["year"], errors="coerce").astype("Int64")
if has_month:
s["month"] = pd.to_numeric(s["month"], errors="coerce").astype("Int64")
# Filter to credible
if "credibility_flag" in s.columns:
credible = s[s["credibility_flag"] == "credible"].copy()
else:
credible = s[s["trips"] >= MIN_TRIPS].copy()
print(f"Note: no credibility_flag column; using trips ≥ {MIN_TRIPS:,} as credible.")
credible = credible.dropna(subset=["trips", rate_col])
credible = credible[credible["trips"] >= MIN_TRIPS].copy()
if len(credible) == 0:
print("No credible rows with usable risk data.")
else:
# High risk threshold
cutoff = float(np.nanpercentile(credible[rate_col].values, HIGH_RISK_PCT))
high_risk = credible[credible[rate_col] >= cutoff].copy()
print(
f"High-risk threshold: {HIGH_RISK_PCT}th percentile of {rate_col} among credible rows"
)
print(f"Cutoff value: {cutoff:.2f}")
print(f"High-risk rows: {len(high_risk):,}")
print(f"Total exposure in high-risk rows: {high_risk['trips'].sum():,.0f} trips")
# Table columns
cols_to_show = [
c
for c in [
"mode",
"year" if has_year else None,
"month" if has_month else None,
id_col,
name_col,
"station_lat",
"station_lng",
"trips",
"crash_count",
rate_col,
]
if c and c in high_risk.columns
]
top_risk = high_risk.sort_values([rate_col, "trips"], ascending=False).head(TOP_N)[
cols_to_show
]
print(f"\nTop {TOP_N} Highest-Risk (credible) station-periods:")
display(top_risk)
# Scope info
if has_year and has_month:
scope = (
s.dropna(subset=["year", "month"])
.groupby(["year", "month"])
.size()
.reset_index(name="rows")
)
print("\nScorecard scope (rows by year-month):")
print(scope.sort_values(["year", "month"]).to_string(index=False))
# Plot
top_plot = top_risk.sort_values(rate_col, ascending=True).copy()
fig, ax = plt.subplots(figsize=(12, 6))
y = np.arange(len(top_plot))
vals = top_plot[rate_col].values
ax.barh(y, vals, edgecolor="black")
ax.set_yticks(y)
ax.set_yticklabels([str(n)[:40] for n in top_plot[name_col]], fontsize=9)
ax.set_xlabel("Risk rate per 100k trips", fontsize=11)
ax.set_title(
f"Top {TOP_N} Highest-Risk Station-Periods (Credible)", fontsize=12, fontweight="bold"
)
ax.grid(axis="x", alpha=0.3)
# Right-side year/month columns
x_max = float(np.nanmax(vals)) if len(vals) else 1.0
pad = 0.18 * x_max
ax.set_xlim(0, x_max + pad)
x_sep = x_max + pad * 0.08
x_year = x_max + pad * 0.35
x_month = x_max + pad * 0.70
ax.axvline(x_sep, color="gray", linewidth=1, alpha=0.6)
if has_year and has_month:
ax.text(
x_year,
len(y) - 0.15,
"Year",
ha="center",
va="bottom",
fontsize=10,
fontweight="bold",
)
ax.text(
x_month,
len(y) - 0.15,
"Month",
ha="center",
va="bottom",
fontsize=10,
fontweight="bold",
)
for i, (_, row) in enumerate(top_plot.iterrows()):
yr_txt = "" if pd.isna(row.get("year")) else str(int(row["year"]))
mo_txt = "" if pd.isna(row.get("month")) else str(int(row["month"])).zfill(2)
ax.text(x_year, i, yr_txt, ha="center", va="center", fontsize=9)
ax.text(x_month, i, mo_txt, ha="center", va="center", fontsize=9)
plt.tight_layout()
savefig("top10_highest_risk_station_periods.png")
plt.show()
====================================================================== HIGH-RISK STATIONS ANALYSIS ====================================================================== High-risk threshold: 90th percentile of eb_risk_rate_per_100k_trips among credible rows Cutoff value: 628.72 High-risk rows: 1,646 Total exposure in high-risk rows: 12,567,625 trips Top 10 Highest-Risk (credible) station-periods:
| mode | year | month | station_id | station_name | station_lat | station_lng | trips | crash_count | eb_risk_rate_per_100k_trips | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1938 | nyc | 2019 | 6 | 3734 | E 58 St & 1 Ave (NW Corner) | 40.759125 | -73.962658 | 5008 | 116 | 2068.172197 |
| 308 | nyc | 2019 | 2 | 529.0 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5098 | 109 | 2035.034813 |
| 1843 | nyc | 2019 | 5 | 3712 | W 35 St & Dyer Ave | 40.754692 | -73.997402 | 5486 | 120 | 1999.473641 |
| 3297 | nyc | 2019 | 5 | 3749 | Lexington Ave & E 36 St | 40.747574 | -73.978801 | 5108 | 112 | 1992.237399 |
| 213 | nyc | 2019 | 2 | 401.0 | Allen St & Rivington St | 40.720196 | -73.989978 | 5270 | 109 | 1979.209296 |
| 802 | nyc | 2019 | 10 | 464 | E 56 St & 3 Ave | 40.759345 | -73.967597 | 5731 | 122 | 1934.592498 |
| 42 | nyc | 2019 | 12 | 529 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5672 | 112 | 1931.177105 |
| 609 | nyc | 2019 | 11 | 3236 | W 42 St & Dyer Ave | 40.758985 | -73.993800 | 5081 | 106 | 1926.525165 |
| 496 | nyc | 2019 | 1 | 529.0 | W 42 St & 8 Ave | 40.757570 | -73.990985 | 5319 | 105 | 1907.926813 |
| 690 | nyc | 2019 | 2 | 531.0 | Forsyth St & Broome St | 40.718939 | -73.992663 | 5339 | 104 | 1878.788813 |
Scorecard scope (rows by year-month): year month rows 2019 1 769 2019 2 767 2019 3 770 2019 4 786 2019 5 796 2019 6 795 2019 7 790 2019 8 795 2019 9 808 2019 10 840 2019 11 870 2019 12 881 2020 1 917 2020 2 908 2020 3 904 2020 4 900 2020 5 940 2020 6 978 2020 7 1008 2020 8 1055 2020 9 1106 2020 10 1162 2020 11 1166 2020 12 1189 2023 1 1817 2023 2 1872 2023 3 1898 2023 4 1912 2023 5 1910 2023 6 1923 2023 7 1964 2023 8 2014 2023 9 2060 2023 10 2180 2023 11 2103 2023 12 2202 2025 1 2252 2025 2 2244 2025 3 2247 2025 4 2247 2025 5 2245 2025 6 2207 2025 7 2206 2025 8 2198 2025 9 2213 2025 10 2203 2025 11 2222 2025 12 2227
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/top10_highest_risk_station_periods.png
In [3]:
# --- Risk vs Exposure: Multi-panel analysis ---
import math
if df_score is None or len(df_score) == 0:
print("No scorecard data — skipping panel plots")
else:
# Prepare data
plot_df = df_score.copy()
# Normalize columns
plot_df = plot_df.rename(columns={"exposure_trips": "trips"}, errors="ignore")
exposure_col = "trips"
rate_col = "eb_risk_rate_per_100k_trips"
crash_col = "crash_count" if "crash_count" in plot_df.columns else None
# Filter to credible
if "credibility_flag" in plot_df.columns:
plot_df = plot_df[plot_df["credibility_flag"] == "credible"].copy()
# Clean types
plot_df[exposure_col] = pd.to_numeric(plot_df[exposure_col], errors="coerce")
plot_df[rate_col] = pd.to_numeric(plot_df[rate_col], errors="coerce")
if crash_col:
plot_df[crash_col] = pd.to_numeric(plot_df[crash_col], errors="coerce").fillna(0)
plot_df = plot_df.dropna(subset=[exposure_col, rate_col])
plot_df = plot_df[plot_df[exposure_col] > 0].copy()
# Check if we have any data left
if len(plot_df) == 0:
print(f"No credible data for mode={mode} — skipping panel plots")
else:
has_period_cols = "year" in plot_df.columns and "month" in plot_df.columns
# Helper functions
def draw_scatter(ax, df_slice):
if crash_col:
return ax.scatter(
df_slice[exposure_col],
df_slice[rate_col],
c=df_slice[crash_col],
alpha=0.6,
s=30,
vmin=vmin,
vmax=vmax,
)
else:
ax.scatter(df_slice[exposure_col], df_slice[rate_col], alpha=0.6, s=30)
return None
def mark_hotspots(ax, df_slice):
if "prevention_hotspot" in df_slice.columns:
hotspots = df_slice[df_slice["prevention_hotspot"] == True]
if len(hotspots) > 0:
ax.scatter(
hotspots[exposure_col],
hotspots[rate_col],
color="red",
s=130,
marker="*",
edgecolors="black",
linewidths=1,
label="Prevention Hotspots",
zorder=5,
)
ax.legend(loc="best", fontsize=7)
def style_axes(ax, title):
ax.set_xlabel("Exposure (trips)", fontsize=9)
ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=9)
ax.set_title(title, fontsize=10, fontweight="bold")
ax.set_xscale("log")
ax.grid(alpha=0.3)
# Global color scale
vmin = vmax = None
if crash_col and len(plot_df) > 0:
vmin, vmax = float(plot_df[crash_col].min()), float(plot_df[crash_col].max())
if not has_period_cols:
# Overall plot only
print(f"No year/month columns for mode={mode}. Making overall plot only.")
fig, ax = plt.subplots(figsize=(7.2, 4.8))
sc = draw_scatter(ax, plot_df)
mark_hotspots(ax, plot_df)
style_axes(ax, f"Risk vs Exposure (overall) — mode={mode}")
if sc is not None:
plt.colorbar(sc, ax=ax, label="Crash Count")
savefig(f"risk_vs_exposure_overall_mode{mode}.png")
plt.show()
else:
# Prepare year/month
plot_df["year"] = pd.to_numeric(plot_df["year"], errors="coerce")
plot_df["month"] = pd.to_numeric(plot_df["month"], errors="coerce")
plot_df = plot_df.dropna(subset=["year", "month"])
plot_df["year"] = plot_df["year"].astype(int)
plot_df["month"] = plot_df["month"].astype(int)
years_to_plot = sorted(plot_df["year"].unique())
months_to_plot = sorted(plot_df["month"].unique())
# Guard against empty data after filtering
if not years_to_plot or not months_to_plot:
print(f"No data after year/month filtering for mode={mode} — skipping panel plots")
else:
print(
f"[panel] mode={mode} years={years_to_plot} months={months_to_plot} rows={len(plot_df):,}"
)
# =========================================================
# PANEL A: One subplot per (year, month) - paginated
# =========================================================
pairs = [(y, m) for y in years_to_plot for m in months_to_plot]
total = len(pairs)
PANELS_PER_PAGE = 12
NCOLS = 3
NROWS = int(math.ceil(PANELS_PER_PAGE / NCOLS))
pages = int(math.ceil(total / PANELS_PER_PAGE))
print(f"[panelA] Total panels={total} -> pages={pages}")
for page_idx in range(pages):
start = page_idx * PANELS_PER_PAGE
page_pairs = pairs[start : start + PANELS_PER_PAGE]
figA, axesA = plt.subplots(
NROWS, NCOLS, figsize=(4.6 * NCOLS, 3.6 * NROWS), squeeze=False
)
scatter_for_cbar = None
for i, (y, m) in enumerate(page_pairs):
r, c = divmod(i, NCOLS)
ax = axesA[r][c]
d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)]
sc = draw_scatter(ax, d)
mark_hotspots(ax, d)
style_axes(ax, f"{y}-{m:02d} (mode={mode})")
if scatter_for_cbar is None and sc is not None:
scatter_for_cbar = sc
# Turn off unused axes
for j in range(len(page_pairs), NROWS * NCOLS):
r, c = divmod(j, NCOLS)
axesA[r][c].axis("off")
figA.suptitle(
f"Risk vs Exposure — mode={mode} (page {page_idx + 1}/{pages})",
fontsize=13,
fontweight="bold",
)
figA.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])
if scatter_for_cbar is not None:
import matplotlib as mpl
sm = mpl.cm.ScalarMappable(
norm=scatter_for_cbar.norm, cmap=scatter_for_cbar.cmap
)
sm.set_array([])
visible_axes = [
ax for ax in axesA.ravel() if ax.get_visible() and ax.has_data()
]
cbar = figA.colorbar(sm, ax=visible_axes, fraction=0.035, pad=0.02)
cbar.set_label("Crash Count")
savefig(f"risk_vs_exposure_panel_periods_mode{mode}_page{page_idx + 1:02d}.png")
plt.show()
# =========================================================
# PANEL B: Yearly comparison (overlay years) - one subplot per month
# =========================================================
n = len(months_to_plot)
ncols = min(3, n) if n > 1 else 1
nrows = int(math.ceil(n / ncols))
figB, axesB = plt.subplots(
nrows, ncols, figsize=(4.6 * ncols, 3.6 * nrows), squeeze=False
)
# Auto-select years to overlay (endpoints if too many)
if len(years_to_plot) > 3:
years_overlay = [min(years_to_plot), max(years_to_plot)]
overlay_tag = f"endpoints_{years_overlay[0]}_{years_overlay[1]}"
else:
years_overlay = years_to_plot
overlay_tag = "allYears"
for i, m in enumerate(months_to_plot):
r, c = divmod(i, ncols)
ax = axesB[r][c]
any_data = False
for y in years_overlay:
d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)]
if d.empty:
continue
any_data = True
ax.scatter(d[exposure_col], d[rate_col], alpha=0.5, s=22, label=str(y))
mark_hotspots(ax, d)
style_axes(ax, f"Year comparison — month {m:02d} (mode={mode})")
if any_data:
ax.legend(title="Year", fontsize=8, title_fontsize=9, loc="best")
else:
ax.text(
0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center"
)
ax.set_axis_off()
# Turn off unused axes
for j in range(n, nrows * ncols):
r, c = divmod(j, ncols)
axesB[r][c].axis("off")
figB.suptitle(
f"Yearly comparison (overlay: {overlay_tag}) — mode={mode}",
fontsize=13,
fontweight="bold",
)
figB.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])
savefig(f"risk_vs_exposure_panel_year_compare_mode{mode}_{overlay_tag}.png")
plt.show()
[panel] mode=nyc years=[np.int64(2019), np.int64(2020), np.int64(2023), np.int64(2025)] months=[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)] rows=16,454 [panelA] Total panels=48 -> pages=4
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page01.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page02.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page03.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page04.png
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_year_compare_modenyc_endpoints_2019_2025.png
In [4]:
# --- Check if seasonality analysis is possible ---
can_do_seasonality = False
if df_score is None:
print("=" * 80)
print(" SEASONALITY ANALYSIS SKIPPED")
print("=" * 80)
print("No scorecard data available.")
elif "year" not in df_score.columns or "month" not in df_score.columns:
print("=" * 80)
print(" SEASONALITY ANALYSIS SKIPPED")
print("=" * 80)
print("No year/month columns in scorecard.")
else:
if "credibility_flag" in df_score.columns:
credible = df_score[df_score["credibility_flag"] == "credible"].copy()
else:
credible = df_score.copy()
if len(credible) == 0:
print("=" * 80)
print(" SEASONALITY ANALYSIS SKIPPED")
print("=" * 80)
print(f"No credible data for {mode} (min_trips threshold not met).")
print("This is expected for JC mode (smaller system).")
else:
can_do_seasonality = True
print(f"✓ Seasonality analysis available: {len(credible):,} credible rows")
✓ Seasonality analysis available: 16,454 credible rows
In [5]:
# --- Seasonality: line chart (median risk by month, per year) ---
if not can_do_seasonality:
print("Skipped")
else:
risk_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in credible.columns
else "risk_rate_per_100k_trips"
)
# Aggregate: median + IQR per (year, month)
summary = (
credible.groupby(["year", "month"])[risk_col]
.agg(median="median", q25=lambda x: x.quantile(0.25), q75=lambda x: x.quantile(0.75))
.reset_index()
)
display(summary)
years = sorted(summary["year"].unique())
plt.figure(figsize=(10, 5))
for year in years:
sub = summary[summary["year"] == year].sort_values("month")
plt.plot(sub["month"], sub["median"], marker="o", linewidth=2, label=str(int(year)))
plt.fill_between(sub["month"], sub["q25"], sub["q75"], alpha=0.15)
plt.title(
f"Risk Seasonality (Median ± IQR) — mode={mode} ({run_label})",
fontsize=14,
fontweight="bold",
)
plt.xlabel("Month", fontsize=12)
plt.ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
plt.xticks(range(1, 13))
plt.legend(title="Year")
plt.grid(True, alpha=0.3, linestyle="--")
plt.tight_layout()
savefig("03_seasonality_lines.png")
plt.show()
| year | month | median | q25 | q75 | |
|---|---|---|---|---|---|
| 0 | 2019 | 1 | 931.674908 | 703.777840 | 1163.266958 |
| 1 | 2019 | 2 | 895.390758 | 702.786556 | 1062.695457 |
| 2 | 2019 | 3 | 795.401601 | 587.429084 | 1044.317599 |
| 3 | 2019 | 4 | 605.555216 | 417.475105 | 821.939478 |
| 4 | 2019 | 5 | 653.168411 | 435.096762 | 895.536100 |
| 5 | 2019 | 6 | 569.709310 | 387.226379 | 841.979261 |
| 6 | 2019 | 7 | 536.728441 | 356.427404 | 755.142102 |
| 7 | 2019 | 8 | 474.230785 | 336.443408 | 684.216414 |
| 8 | 2019 | 9 | 505.657358 | 358.239322 | 730.963851 |
| 9 | 2019 | 10 | 541.362887 | 380.519044 | 801.176152 |
| 10 | 2019 | 11 | 651.113335 | 494.546484 | 871.402656 |
| 11 | 2019 | 12 | 824.500635 | 678.058527 | 1110.545835 |
| 12 | 2020 | 1 | 599.547443 | 503.391432 | 762.983857 |
| 13 | 2020 | 2 | 673.577322 | 511.836083 | 868.001991 |
| 14 | 2020 | 3 | 489.893837 | 348.230712 | 609.022613 |
| 15 | 2020 | 4 | 114.238527 | 89.956749 | 159.469111 |
| 16 | 2020 | 5 | 144.061003 | 100.948964 | 192.333876 |
| 17 | 2020 | 6 | 159.051868 | 109.662597 | 224.807339 |
| 18 | 2020 | 7 | 203.061124 | 144.667290 | 272.919101 |
| 19 | 2020 | 8 | 200.027953 | 134.095910 | 295.082461 |
| 20 | 2020 | 9 | 197.555810 | 136.425498 | 284.274431 |
| 21 | 2020 | 10 | 237.762271 | 176.528517 | 319.912636 |
| 22 | 2020 | 11 | 243.871772 | 191.837125 | 332.153269 |
| 23 | 2020 | 12 | 329.758871 | 262.120264 | 410.082584 |
| 24 | 2023 | 1 | 323.056529 | 252.382544 | 389.650108 |
| 25 | 2023 | 2 | 315.262634 | 249.117389 | 377.028163 |
| 26 | 2023 | 3 | 310.508507 | 238.036592 | 397.504807 |
| 27 | 2023 | 4 | 270.760531 | 192.402170 | 347.861949 |
| 28 | 2023 | 5 | 252.071002 | 179.770216 | 352.331997 |
| 29 | 2023 | 6 | 255.286132 | 181.410071 | 348.992079 |
| 30 | 2023 | 7 | 226.964927 | 163.251120 | 303.210564 |
| 31 | 2023 | 8 | 222.354278 | 160.865127 | 301.553168 |
| 32 | 2023 | 9 | 251.951900 | 172.147280 | 332.124428 |
| 33 | 2023 | 10 | 258.111241 | 185.157357 | 355.594613 |
| 34 | 2023 | 11 | 264.715850 | 202.572098 | 357.700757 |
| 35 | 2023 | 12 | 343.304628 | 255.790919 | 432.743387 |
| 36 | 2025 | 1 | 268.644577 | 201.293836 | 339.327696 |
| 37 | 2025 | 2 | 243.815682 | 189.264557 | 321.634549 |
| 38 | 2025 | 3 | 227.859349 | 165.105564 | 308.368454 |
| 39 | 2025 | 4 | 213.642906 | 152.560479 | 278.695432 |
| 40 | 2025 | 5 | 229.218154 | 159.952777 | 302.974368 |
| 41 | 2025 | 6 | 191.943842 | 140.091285 | 255.723631 |
| 42 | 2025 | 7 | 175.714952 | 122.485741 | 237.394042 |
| 43 | 2025 | 8 | 166.907529 | 116.882975 | 229.830537 |
| 44 | 2025 | 9 | 181.136789 | 124.492817 | 260.641381 |
| 45 | 2025 | 10 | 197.237487 | 136.353898 | 265.805491 |
| 46 | 2025 | 11 | 235.411327 | 176.371779 | 317.774910 |
| 47 | 2025 | 12 | 323.986555 | 249.000950 | 419.496738 |
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/03_seasonality_lines.png
In [6]:
# --- Seasonality: heatmap (year × month) ---
if not can_do_seasonality:
print("Skipped")
else:
risk_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in credible.columns
else "risk_rate_per_100k_trips"
)
pivot = credible.groupby(["year", "month"])[risk_col].median().unstack()
fig, ax = plt.subplots(figsize=(10, 3 + 0.5 * len(pivot)))
im = ax.imshow(pivot.values, aspect="auto", cmap="YlOrRd")
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels([f"{int(m):02d}" for m in pivot.columns])
ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels([str(int(y)) for y in pivot.index])
ax.set_xlabel("Month", fontsize=12)
ax.set_ylabel("Year", fontsize=12)
ax.set_title(
f"Risk Heatmap (Year × Month) — mode={mode} ({run_label})", fontsize=14, fontweight="bold"
)
cbar = plt.colorbar(im, ax=ax, fraction=0.03)
cbar.set_label("Median EB Risk Rate")
# Add text labels
for i in range(len(pivot.index)):
for j in range(len(pivot.columns)):
val = pivot.iloc[i, j]
if not pd.isna(val):
color = "white" if val > pivot.values[~np.isnan(pivot.values)].mean() else "black"
ax.text(j, i, f"{val:.0f}", ha="center", va="center", color=color, fontsize=9)
plt.tight_layout()
savefig("04_seasonality_heatmap.png")
plt.show()
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/04_seasonality_heatmap.png
In [7]:
# --- Boxplot: risk distribution by year ---
if not can_do_seasonality:
print("Skipped")
else:
risk_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in credible.columns
else "risk_rate_per_100k_trips"
)
years = sorted(credible["year"].unique())
data_by_year = [credible[credible["year"] == y][risk_col].dropna().values for y in years]
fig, ax = plt.subplots(figsize=(8, 5))
bp = ax.boxplot(data_by_year, tick_labels=[str(int(y)) for y in years], showfliers=False)
for median in bp["medians"]:
median.set_color("red")
median.set_linewidth(2)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
ax.set_title(
f"Risk Distribution by Year — mode={mode} ({run_label})", fontsize=14, fontweight="bold"
)
ax.grid(axis="y", alpha=0.3, linestyle="--")
from matplotlib.lines import Line2D
ax.legend([Line2D([0], [0], color="red", linewidth=2)], ["Median"], loc="best")
plt.tight_layout()
savefig("05_distribution_boxplot.png")
plt.show()
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/05_distribution_boxplot.png
In [8]:
# --- Slopegraph: station-level change between years ---
if not can_do_seasonality:
print("Skipped")
else:
id_col = "start_station_id" if "start_station_id" in credible.columns else None
risk_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in credible.columns
else "risk_rate_per_100k_trips"
)
exp_col = "exposure_trips" if "exposure_trips" in credible.columns else "trips"
if id_col is None:
print("No station ID column — skipping slopegraph")
else:
years = sorted(credible["year"].unique())
if len(years) < 2:
print("Need at least 2 years for slopegraph")
else:
y0, y1 = min(years), max(years)
# Aggregate: weighted average across months
def weighted_avg(group):
weights = group[exp_col].values
values = group[risk_col].values
if weights.sum() == 0:
return np.nan
return np.average(values, weights=weights)
agg = credible.groupby([id_col, "year"]).apply(weighted_avg).reset_index()
agg.columns = [id_col, "year", "risk"]
df0 = agg[agg["year"] == y0][[id_col, "risk"]].rename(columns={"risk": "risk0"})
df1 = agg[agg["year"] == y1][[id_col, "risk"]].rename(columns={"risk": "risk1"})
merged = df0.merge(df1, on=id_col).dropna()
if len(merged) == 0:
print(f"No stations present in both {y0} and {y1}")
else:
merged = merged.head(100) # Limit for readability
fig, ax = plt.subplots(figsize=(8, 6))
for _, row in merged.iterrows():
ax.plot(
[0, 1], [row["risk0"], row["risk1"]], alpha=0.2, color="gray", linewidth=1
)
ax.plot(
[0, 1],
[merged["risk0"].median(), merged["risk1"].median()],
color="red",
linewidth=3,
label="Median",
)
ax.set_xticks([0, 1])
ax.set_xticklabels([str(y0), str(y1)])
ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
ax.set_title(
f"Station-Level Risk Change — mode={mode} ({run_label})",
fontsize=14,
fontweight="bold",
)
ax.legend()
ax.grid(True, alpha=0.3, linestyle="--")
plt.tight_layout()
savefig("06_station_change_slopegraph.png")
plt.show()
improved = (merged["risk1"] < merged["risk0"]).sum()
worsened = (merged["risk1"] > merged["risk0"]).sum()
print(f"\nStations improved: {improved}")
print(f"Stations worsened: {worsened}")
No stations present in both 2019 and 2025
/tmp/ipykernel_5664/1551659767.py:31: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. agg = credible.groupby([id_col, "year"]).apply(weighted_avg).reset_index()
In [9]:
# --- Summary ---
print("=" * 60)
print(f"RISK ANALYSIS SUMMARY — mode={mode} ({run_label})")
print("=" * 60)
if df_score is None:
print("No scorecard data available")
else:
exp_col = "exposure_trips" if "exposure_trips" in df_score.columns else "trips"
risk_col = (
"eb_risk_rate_per_100k_trips"
if "eb_risk_rate_per_100k_trips" in df_score.columns
else "risk_rate_per_100k_trips"
)
print(f"\nTotal rows in scorecard: {len(df_score):,}")
if "credibility_flag" in df_score.columns:
n_credible = (df_score["credibility_flag"] == "credible").sum()
print(f"Credible station-periods: {n_credible:,}")
print(f"\nExposure (trips):")
print(f" Total: {df_score[exp_col].sum():,.0f}")
print(f" Median per station-period: {df_score[exp_col].median():,.0f}")
print(f"\nRisk ({risk_col}):")
print(f" Mean: {df_score[risk_col].mean():.2f}")
print(f" Median: {df_score[risk_col].median():.2f}")
print(f" Std: {df_score[risk_col].std():.2f}")
if "prevention_hotspot" in df_score.columns:
n_hotspots = df_score["prevention_hotspot"].sum()
print(f"\nPrevention hotspots: {n_hotspots:,}")
if "eb_m_prior_used" in df_score.columns:
m_val = df_score["eb_m_prior_used"].iloc[0]
print(f"\nEB prior strength (m): {m_val:,.0f}")
print("\n" + "=" * 60)
============================================================ RISK ANALYSIS SUMMARY — mode=nyc (y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc) ============================================================ Total rows in scorecard: 72,466 Credible station-periods: 16,454 Exposure (trips): Total: 241,616,968 Median per station-period: 1,721 Risk (eb_risk_rate_per_100k_trips): Mean: 954.59 Median: 773.83 Std: 761.23 Prevention hotspots: 1,836 EB prior strength (m): 1,000 ============================================================